# Loading packages
library(pryr) # Memory usage information
library(lubridate) # date manipulation
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
library(dplyr) # data manipulation
Attaching package: ‘dplyr’
The following objects are masked from ‘package:lubridate’:
intersect, setdiff, union
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyr) # data manipulation
library(ggplot2) # graphics
library(plotly) # Interactive graphics
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(stringr) # string manipulation
library(stringi) # remove diacritics
library(ggfortify)
library(rjson) # read json
library(nasapower) # nasapower api data
library(ggsci) # Scientific color palletes
library(zoo) # Replace NAs in time series analysis
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
library(rgdal) # Work with geo-spatial data
Carregando pacotes exigidos: sp
rgdal: version: 1.4-4, (SVN revision 833)
Geospatial Data Abstraction Library extensions to R successfully loaded
Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
Path to GDAL shared files: /usr/share/gdal/2.2
GDAL binary built with GEOS: TRUE
Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
Path to PROJ.4 shared files: (autodetected)
Linking to sp version: 1.3-1
library(sf) # geo-spatial vector data
Linking to GEOS 3.6.2, GDAL 2.2.3, PROJ 4.9.3
library(TSA) # Time Series Analys
Attaching package: ‘TSA’
The following objects are masked from ‘package:stats’:
acf, arima
The following object is masked from ‘package:utils’:
tar
library(qwraps2)# for pretty summary statistics
There are 4 csv files containing data about the fires of all ocurrences around all brazilian territory. Each file has data between 1st Sept to 30th Sept of each year from 2015 to 2019.
temp <- list.files(path = "./database/modis_15-19" ,pattern = "*.csv")
myfiles = lapply(paste0("./database/modis_15-19/",temp), read.csv)
nrecords <- lapply(myfiles, nrow)
data <- bind_rows(myfiles)
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorUnequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorUnequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
names(data)
[1] "datahora" "satelite" "pais" "estado" "municipio" "bioma"
[7] "diasemchuva" "precipitacao" "riscofogo" "latitude" "longitude" "frp"
# As we have many data records, we'll be working with a small subset of it
df <- data[sample(nrow(data), 0.1*nrow(data)), ]
totalNo <- nrow(df)
object_size(df)
8.32 MB
there are 820929 observations in our dataset which sums up to 66.8 MB of storaged data.
str(data)
'data.frame': 885117 obs. of 12 variables:
$ datahora : chr "2015/08/11 17:18:00" "2015/08/14 17:48:00" "2015/08/15 16:53:00" "2015/08/03 16:28:00" ...
$ satelite : Factor w/ 1 level "AQUA_M-T": 1 1 1 1 1 1 1 1 1 1 ...
$ pais : Factor w/ 1 level "Brasil": 1 1 1 1 1 1 1 1 1 1 ...
$ estado : chr "PARA" "AMAZONAS" "PARA" "PARA" ...
$ municipio : chr "ITAITUBA" "APUI" "ALTAMIRA" "SANTA MARIA DAS BARREIRAS" ...
$ bioma : Factor w/ 6 levels "Amazonia","Caatinga",..: 1 1 1 1 3 3 3 3 3 3 ...
$ diasemchuva : int NA NA NA NA NA NA NA NA NA NA ...
$ precipitacao: num 0 0 0 0 0 0 0 0 0 0 ...
$ riscofogo : num 0.96 0.95 1 NaN 1 1 1 1 1 1 ...
$ latitude : num -6.51 -7.01 -6.12 -8.67 -6.51 ...
$ longitude : num -56.1 -59.3 -53.4 -50 -44.8 ...
$ frp : num NA NA NA NA NA NA NA NA NA NA ...
summary(data)
datahora satelite pais estado municipio
Length:885117 AQUA_M-T:885117 Brasil:885117 Length:885117 Length:885117
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
bioma diasemchuva precipitacao riscofogo latitude
Amazonia :436732 Min. : 0.0 Min. : 0.00 Min. :0.00 Min. :-33.667
Caatinga : 56296 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.:0.37 1st Qu.:-12.806
Cerrado :290662 Median : 0.0 Median : 0.00 Median :1.00 Median : -9.244
Mata Atlantica: 73284 Mean : 9.9 Mean : 1.08 Mean :0.71 Mean : -9.954
Pampa : 4985 3rd Qu.: 8.0 3rd Qu.: 0.00 3rd Qu.:1.00 3rd Qu.: -5.826
Pantanal : 23158 Max. :120.0 Max. :309.80 Max. :1.00 Max. : 5.154
NA's :401480 NA's :107875 NA's :137531
longitude frp
Min. :-73.67 Min. : 0.0
1st Qu.:-57.03 1st Qu.: 14.7
Median :-50.97 Median : 27.3
Mean :-52.15 Mean : 58.6
3rd Qu.:-46.23 3rd Qu.: 56.3
Max. :-34.82 Max. :7303.4
NA's :608511
In order to print a well formatted table of summary statistics, lets write a function to do it
options(qwraps2_markup = "latex")
my_summary <-
list("diasemchuva" =
list("min" = ~ min(.data$diasemchuva, na.rm = T),
"mediana" = ~ qwraps2::median_iqr(.data$diasemchuva, na_rm = T),
"média (desv. padr.)" = ~ qwraps2::mean_sd(.data$diasemchuva, na_rm = T),
"máx" = ~ max(.data$diasemchuva, na.rm = T)),
"precipitacao" =
list("min" = ~ min(.data$precipitacao, na.rm = T),
"mediana" = ~ qwraps2::median_iqr(.data$precipitacao, na_rm = T),
"média (desv. padr.)" = ~ qwraps2::mean_sd(.data$precipitacao, na_rm = T),
"máx" = ~ max(.data$precipitacao, na.rm = T)),
"riscofogo" =
list("min" = ~ min(.data$riscofogo, na.rm = T),
"mediana" = ~ qwraps2::median_iqr(.data$riscofogo, na_rm = T),
"média (desv. padr.)" = ~ qwraps2::mean_sd(.data$riscofogo, na_rm = T),
"máx" = ~ max(.data$riscofogo, na.rm = T)),
"frp" =
list("min" = ~ min(.data$frp, na.rm = T),
"mediana" = ~ qwraps2::median_iqr(.data$frp, na_rm = T),
"média (desv. padr.)" = ~ qwraps2::mean_sd(.data$frp, na_rm = T),
"máx" = ~ max(.data$frp, na.rm = T)),
"Bioma" =
list("Amazônia" = ~ qwraps2::n_perc0(.data$bioma == 'Amazonia'),
"Cerrado" = ~ qwraps2::n_perc0(.data$bioma == 'Cerrado'),
"Mata Atlântica" = ~ qwraps2::n_perc0(.data$bioma == 'Mata Atlantica'),
"Pantanal" = ~ qwraps2::n_perc0(.data$bioma == 'Pantanal'),
"Caatinga" = ~ qwraps2::n_perc0(.data$bioma == 'Caatinga'),
"Pampa" = ~ qwraps2::n_perc0(.data$bioma == 'Pampa')
)
)
### Overall
pretty_summary <- summary_table(data, my_summary)
pretty_summary
Aparentemente, A coluna de área industrial e FRP tem poucas entradas não nulas
paste("Percentage of NAs entries on `FRP` =",
100*sum(is.na(df$frp))/totalNo)
[1] "Percentage of NAs entries on `FRP` = 68.7598151642169"
wetDays <- df$diasemchuva <= 10
dryDays <- df$diasemchuva > 10
noWeatherInfo <- is.na(df$diasemchuva)
paste("Percentage of NAs entries on `diasemchuva` =",
round(100*(sum(noWeatherInfo))/totalNo,2))
[1] "Percentage of NAs entries on `diasemchuva` = 45.43"
paste("Percentage of records with less than (or equal to) 10 days of no rain =",
round(100*(sum(wetDays, na.rm = T))/totalNo,2))
[1] "Percentage of records with less than (or equal to) 10 days of no rain = 42.79"
paste("Percentage of records with more than 10 days of no rain =",
round(100*(sum(dryDays, na.rm = T))/totalNo,2))
[1] "Percentage of records with more than 10 days of no rain = 11.78"
By the last results, it is possible that the number of days with no rain have a huge impact over the fires triggers.
we can also investigate another columns that have either redundant data or not useful data.
levels(df$pais)
[1] "Brasil"
levels(df$satelite)
[1] "AQUA_M-T"
df$pais <- NULL
df$satelite <- NULL
object_size(df)
7.61 MB
By looking at the structure of our dataframe, it becomes clear that the Datahora is a datetime object not a Factor.
df$datahora <- ymd_hms(str_replace_all(df$datahora,"/","-"))
levels(df$bioma)
[1] "Amazonia" "Caatinga" "Cerrado" "Mata Atlantica" "Pampa"
[6] "Pantanal"
We have 6 distincts biomas on our dataset. Perhaps, by knowing how is the distribution of fires among different biomas, we can refine our research.
counts <- table(df$bioma)
counts
Amazonia Caatinga Cerrado Mata Atlantica Pampa Pantanal
43818 5547 28905 7379 504 2358
proportions <- 100*counts / sum(counts)
proportions
Amazonia Caatinga Cerrado Mata Atlantica Pampa Pantanal
49.5057112 6.2670177 32.6569579 8.3368169 0.5694207 2.6640757
As we can see, Amazonia represents around half of our dataset. In addition, the number of fire occurrences is very much like the proportion of land that each bioma has in brazilian territory.
————————– Visualisations —————————–
# For text in plots see
# http://r-statistics.co/Complete-Ggplot2-Tutorial-Part2-Customizing-Theme-With-R-Code.html
# For legend in plots see
# https://www.datanovia.com/en/blog/ggplot-legend-title-position-and-labels/
proportions <- data %>% group_by(bioma) %>% summarise(n = round(100*n()/nrow(.), 1))
quantities <- as.character(sort(desc(proportions$n)))
ggplot(proportions, aes(x = reorder(bioma, -n), y = n, fill = bioma)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title="Proporção de observações por Bioma", y="Proporção %", x="Bioma", caption="Fonte: Elaborado pelo Autor") +
theme(plot.title=element_text(size=10,
face="bold",
family="Arial",
hjust=0.5,
lineheight=1.2), # title
plot.subtitle=element_text(size=10,
family="Arial",
face="bold",
hjust=0.5), # subtitle
axis.title.x=element_text(size=8), # X axis title
axis.title.y=element_text(size=8), # Y axis title
axis.text.x=element_text(size=6,
vjust=.5), # X axis text
axis.text.y=element_text(size=6),
legend.title = element_blank(),
legend.position = "none",
plot.caption = element_text(size = 6)) + # Y axis text
ggsave("/home/alexandre/Documentos/tcc/images/plots_cap3/bioma-proportions.png", dpi = "retina", width = 10, height = 8, units = "cm")
Now, it is also important to get the underlying trend of the number of fire ocurrences through the months of the year. Let’s start exploring this idea:
df %>%
mutate(month = floor_date(date(datahora), "month")) %>%
group_by(month, bioma) %>%
summarise(n_fires = n()) %>%
ggplot(aes(x = month, y = n_fires/1000)) +
geom_bar(aes(fill = bioma), stat = 'identity') +
scale_x_date(date_labels = "%b/%y", date_breaks = "6 months") +
labs(y = "Number of fires in thousands", title = 'Brazilian Wildfires by Year')
In order to plot the number of fires by state, we must do some data wrangling to put the data in the desired format.
is.odd <- function(x) x %% 2 != 0
is.even <- function(x) x %% 2 == 0
geo_data <- fromJSON(file = "./geo_data/geo_data_states.json")
maps_df <- list()
for (i in 1:length(geo_data$features)) {
coordinates <- unlist(geo_data$features[[i]]$geometry$coordinates[[1]])
long <- coordinates[is.odd(seq_along(coordinates))]
lat <- coordinates[is.even(seq_along(coordinates))]
if(length(long) != length(lat)){
stop("Wrong subset of coordinate vector", call. = FALSE)
}
geo_length <- length(long)
estado <- toupper(geo_data$features[[i]]$properties$name)
codigo <- as.integer(geo_data$features[[i]]$properties$codigo_ibg)
regiao <- as.integer(geo_data$features[[i]]$properties$regiao_id)
maps_df[[i]] <- as_tibble(
list(
estado = rep(stri_trans_general(estado,"Latin-ASCII"), geo_length),
codigo = rep(codigo, geo_length),
regiao = rep(regiao, geo_length),
longitude = long,
latitude = lat
)
)
}
map_df <- bind_rows(maps_df)
# the `estado` column of the map dataframe is a character, so in order
# to avoid coertion from factor to character, let's convert it explicitly
df$estado <- as.character(df$estado)
df %>%
select(estado) %>%
group_by(estado) %>%
summarise(n = n()) %>%
right_join(map_df, by = 'estado') %>%
ggplot(aes(x = longitude, y = latitude, group = codigo, fill = n/1000)) +
geom_polygon() +
geom_path(color = 'white', size = 0.1) +
scale_fill_continuous(low = "orange",
high = "darkred",
name = 'Number of fires\n(thousands)'
)
As we can see, the state with the most fire ocurrences is Pará in which most of its lands belongs to the amazon ecossystem, this may explain why half the dataset ocurrences comes from Amazon fires.
As we can see below, this trend happens all years.
df %>%
select(estado, datahora) %>%
mutate(ano = year(datahora)) %>%
group_by(estado, ano) %>%
summarise(n = n()) %>%
right_join(map_df, by = 'estado') %>%
ggplot(aes(x = longitude, y = latitude, group = codigo, fill = n/1000)) +
geom_polygon() +
facet_wrap(~ano) +
geom_path(color = 'white', size = 0.1) +
scale_fill_continuous(low = "orange",
high = "darkred",
name = 'Number of fires\n(thousands)'
)
In terms of raw numbers, we can visualize this difference below.
df %>%
group_by(estado) %>%
ggplot(aes(x = factor(estado))) +
geom_bar() +
coord_flip()
Going down one level, let’s try an analysis by county
geo_data_counties <- fromJSON(file = "./geo_data/geo_data_counties.json")
maps_df_counties <- list()
for (i in 1:length(geo_data_counties$features)) {
coordinates <- unlist(geo_data_counties$features[[i]]$geometry$coordinates[[1]])
long <- coordinates[is.odd(seq_along(coordinates))]
lat <- coordinates[is.even(seq_along(coordinates))]
if(length(long) != length(lat)){
stop("Wrong subset of coordinate vector", call. = FALSE)
}
geo_length <- length(long)
municipio <- geo_data_counties$features[[i]]$properties$name
municipio <- toupper(stri_trans_general(municipio, "Latin-ASCII"))
codigo <- as.integer(geo_data_counties$features[[i]]$properties$id)
maps_df_counties[[i]] <- as_tibble(
list(
municipio = rep(stri_trans_general(municipio, "Latin-ASCII"), geo_length),
codigo = rep(codigo, geo_length),
longitude = long,
latitude = lat
)
)
}
map_df_county <- bind_rows(maps_df_counties)
df %>%
select(municipio) %>%
group_by(municipio) %>%
summarise(n = n()) %>%
right_join(map_df_county, by = 'municipio') %>%
ggplot(aes(x = longitude, y = latitude, group = codigo, fill = n)) +
geom_polygon() +
geom_path(color = 'white', size = 0.1) +
scale_fill_continuous(low = "orange",
high = "darkred",
name = 'Number of fires'
)
What about that total number of fire ocurrences by state? From when does it come from?
df %>%
mutate(ano = year(datahora)) %>%
group_by(estado, ano) %>%
ggplot(aes(x = factor(estado), fill = factor(ano))) +
geom_bar() +
coord_flip()
Let’s repeat the last plot, but now considering the proportions
df %>%
mutate(ano = year(datahora)) %>%
group_by(estado, ano) %>%
ggplot(aes(x = factor(estado), fill = factor(ano))) +
geom_bar(position = "fill") +
coord_flip()
Let’s reinforce what we’ve seen by far and plot the data considering a monthly frequency
# Capitalizes names
capitalize <- function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
}
levels <- c("Janeiro", "Fevereiro", "Março", "Abril", "Maio", "Junho", "Julho", "Agosto", "Setembro", "Outubro", "Novembro", "Dezembro")
df %>%
mutate(mes = factor(sapply(months(datahora), capitalize), levels = levels)) %>%
group_by(mes) %>%
ggplot(aes(x = factor(mes), y = 100*..prop.., group = 1)) +
geom_bar()
df %>%
mutate(mes = factor(sapply(months(datahora), capitalize), levels = levels),
ano = factor(year(datahora))) %>%
group_by(ano, mes) %>%
ggplot(aes(x = mes, y = 100*..prop.., group = 1)) +
geom_bar() +
facet_wrap(~ano) + coord_flip()
As we can see, in 2019 the trend was a bit different, maybe it is so because of the popular protests against the liberal and progressive view of brazilian government towards amazonia.
Let’s compact this plot in only one frame:
levels <- c("Jan", "Fev", "Mar", "Abr", "Mai", "Jun", "Jul", "Ago", "Set", "Out", "Nov", "Dez")
df %>%
mutate(mes = factor(sapply(substr(months(df$datahora), 1, 3), capitalize), levels = levels),
ano = factor(year(datahora))) %>%
group_by(ano, mes) %>%
ggplot(aes(x = mes, fill = ano)) +
geom_bar() +
facet_wrap(~ano) +
coord_flip()
df %>%
mutate(mes = factor(sapply(substr(months(df$datahora), 1, 3), capitalize), levels = levels),
ano = factor(year(datahora))) %>%
group_by(ano, mes) %>%
ggplot(aes(x = mes, fill = ano)) +
geom_bar(position = "dodge")
df %>%
mutate(date = date(datahora)) %>%
group_by(date) %>%
ggplot(aes(x = date, group = 1)) +
geom_freqpoly(stat = "count") +
scale_x_date(date_breaks = "6 months", date_labels = "%b/%y")
As we can see above, our data presents strong patterns (seasonality). By doing a Fourier Transform, it is possible get to know what are the main frequencies of this time series.
signal_INPE <- df %>%
mutate(date = date(datahora)) %>%
group_by(date) %>%
summarise(n = n())
p <- periodogram(signal_INPE$n)
dd = data.frame(freq=p$freq, spec=p$spec)
order = dd[order(-dd$spec),]
top3 = head(order, 3)
# display the 2 highest "power" frequencies
top3
time <- 1/top3$freq
time
[1] 345.60000 172.80000 90.94737
Therefore, the seasonality of our dataset is given by annualy, semesterly and quaterly periods. For reference, see https://anomaly.io/detect-seasonality-using-fourier-transform-r/index.html and http://www.di.fc.ul.pt/~jpn/r/fourier/fourier.html
Strangely, our dataset only has records of fire occurences from 3 to 6 pm. Why is it?
df %>%
mutate(hour = hour(datahora)) %>%
group_by(hour) %>%
ggplot(aes(x = hour, group = 1)) +
geom_bar(stat = "count")
O dataframe original não tem informações sobre macro regiões. Utilizando o data frame de mapas, podemos obter esta informação.
regionsList <- list()
codMacroRegions <- list()
n_reg <- length(unique(map_df$regiao))
regions <- unique(map_df$regiao)
for(i in 1:n_reg){
temp <- map_df %>% filter(regiao == regions[i])
states <- unique(temp$estado)
regionsList[[i]] <- states
codMacroRegions[[i]] <- regions[i]
}
names(regionsList) <- c("NORTE", "NORDESTE", "SUDESTE", "CENTRO-OESTE", "SUL")
for(i in 1:n_reg){
df$macroRegiao[df$estado %in% regionsList[[i]]] <- names(regionsList)[i]
df$codMacroRegiao[df$estado %in% regionsList[[i]]] <- codMacroRegions[[i]]
}
# O nome regiao do dataframe map_df nao tem o mesmo sentido que no dataframe df
# sendo assim, criou-se um dataframe auxiliar com os nomes correspondentes
temp <- df[,c("macroRegiao", "codMacroRegiao")]
names(temp) <- c("macroregiao", "regiao")
temp %>%
group_by(macroregiao, regiao) %>%
summarise(n = n()) %>%
right_join(map_df, by = 'regiao') %>%
ggplot(aes(x = longitude, y = latitude, group = regiao, fill = n/1000)) +
geom_polygon() +
geom_path(color = 'white', size = 0.1) +
scale_fill_continuous(low = "orange",
high = "darkred",
name = 'Number of fires\n(thousands)'
)
na.omit(df) %>% ggplot(aes(x = frp, y = precipitacao, alpha = riscofogo)) +
geom_point()
na.omit(df) %>% ggplot(aes(x = log(frp), y = precipitacao, alpha = riscofogo)) +
geom_point()
Notice the relationship among the fire radiative power, the precipitation and the risk of fire. It seems that, with more precipitation, the frp is reduced while with less precipitation the risk of fire remains mostly
na.omit(df) %>% ggplot(aes(x = riscofogo, y = precipitacao, alpha = frp)) +
geom_point(position = "jitter")
Looking at the distribution of the fire radiative power across different biomas, we can see that it is roughly the same, except for Mata Atlantica and Pampa maybe because Pampa is situated at the south brazil, where temperatures are lower which decreases the chances of fire ocurrences. On the other side, Mata Atlantica is mostly distributed along the coast, implying that the humidity that comes from the ocean may hold fire ocurrences from stroke.
na.omit(df) %>% ggplot(aes(x = bioma, y = log(frp))) +
geom_boxplot()
# link for reference https://jfly.uni-koeln.de/color/
# The palette with grey:
cbp1 <- c("#D55E00", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#999999", "#CC79A7")
na.omit(df) %>% ggplot(aes(x = log(frp), col = bioma, fill = bioma)) +
geom_density(alpha = 0.1) +
scale_color_manual(values = cbp1) +
scale_fill_manual(values = cbp1)
As we can see below, we cannot take any conclusion about the risk of fire with respect to the fire radiative power. So sad.
na.omit(df) %>%
ggplot(aes(x = riscofogo, y = log(frp), col = bioma)) +
geom_point() + # Copy from Plot 1
geom_smooth(method = "lm", se = TRUE) +
geom_smooth(aes(group = 1), method = "lm", se = TRUE, linetype = 2)
data19 <- df %>% filter(year(datahora) == '2019')
df %>%
select(estado) %>%
group_by(estado) %>%
summarise(n = n()) %>%
right_join(map_df, by = 'estado') %>%
ggplot(aes(x = longitude, y = latitude, group = codigo)) +
geom_polygon() +
geom_point(inherit.aes = F, data = data19, aes(x = longitude, y = latitude, col = bioma), size = 0.1, shape = 3) +
scale_color_manual(values = cbp1) +
geom_path(color = 'white', size = 0.1)
What about cumulative sums?
accum_fires_inpe <- df %>%
mutate(data = date(datahora),
ano = year(datahora)) %>%
select(ano, data) %>%
group_by(ano, data) %>%
summarise(occurrence = n()) %>%
mutate(cum_sum = cumsum(occurrence),
dia_ano = yday(data))
accum_fires_inpe %>% ggplot(aes(x = data, y = cum_sum)) +
geom_line() +
scale_x_date(date_breaks = "4 months", date_labels = "%b/%y")
splitted <- accum_fires_inpe %>% select(ano, dia_ano, cum_sum)
# Using `do` function to replace NAs values with the last records value
tidy <- splitted %>% spread(key = ano, value = cum_sum) %>% do(zoo::na.locf(.))
tidy$`2019`[273:nrow(tidy)] <- NA
tidy <- tidy %>%
gather("2015", "2016", "2017", "2018", "2019", key = "year", "value" = fires)
tidy %>%
ggplot(aes(x = dia_ano, y = fires, col = year)) +
geom_line() +
scale_color_manual(values = c("grey", "grey", "grey", "grey", "blue")) +
scale_x_continuous("Day of the year", breaks = seq(from = 0, to = 366, by = 50)) +
scale_y_continuous("Number of fires")
As seen before, our dataset lacks of 70% information about the fire radiative power of fires. In order to analyze this variable with confident outcomes, we need other data sources. From data collected from Nasa Modis project it is possible to enhance the analysis with much more data.
# Find the documentation at https://earthdata.nasa.gov/earth-observation-data/near-real-time/firms/active-fire-data
# Load the archived data (before 1st aug/19)
nasaModis1 <- read.csv("./database/DL_FIRE_M6_79475/fire_archive_M6_79475.csv")
# Load non standard science quality data (after 1st aug/19)
nasaModis2 <- read.csv("./database/DL_FIRE_M6_79475/fire_nrt_M6_79475.csv")
str(nasaModis1)
'data.frame': 1454832 obs. of 15 variables:
$ latitude : num -20.2 -20.4 -20.2 -18.1 -20 ...
$ longitude : num -40.2 -40.9 -40.2 -42.8 -42 ...
$ brightness: num 309 305 306 305 302 ...
$ scan : num 1.7 1.5 1.7 1.3 1.3 1.7 1.6 1.6 1.1 1.1 ...
$ track : num 1.3 1.2 1.3 1.1 1.1 1.3 1.3 1.3 1.1 1.1 ...
$ acq_date : Factor w/ 1672 levels "2015-01-01","2015-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
$ acq_time : int 143 143 143 144 144 145 145 145 146 146 ...
$ satellite : Factor w/ 2 levels "Aqua","Terra": 2 2 2 2 2 2 2 2 2 2 ...
$ instrument: Factor w/ 1 level "MODIS": 1 1 1 1 1 1 1 1 1 1 ...
$ confidence: int 50 63 28 63 46 56 42 67 42 77 ...
$ version : num 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 6.2 ...
$ bright_t31: num 293 289 293 291 288 ...
$ frp : num 13.1 13.1 10.3 8.9 7 10.9 9.3 17.2 5 9.7 ...
$ daynight : Factor w/ 2 levels "D","N": 2 2 2 2 2 2 2 2 2 2 ...
$ type : int 2 0 2 0 0 0 0 0 0 0 ...
str(nasaModis2)
'data.frame': 194052 obs. of 14 variables:
$ latitude : num -22.9 -22.7 -22 -20.8 -20.2 ...
$ longitude : num -43.7 -42.7 -42.9 -41.1 -41.8 ...
$ brightness: num 308 304 303 315 308 ...
$ scan : num 1 1 1 1.2 1.1 1.3 1.5 2.1 2.1 1 ...
$ track : num 1 1 1 1.1 1 1.1 1.2 1.4 1.4 1 ...
$ acq_date : Factor w/ 61 levels "2019-08-01","2019-08-02",..: 1 1 1 1 1 1 1 1 1 1 ...
$ acq_time : int 135 135 135 135 135 135 135 135 135 135 ...
$ satellite : Factor w/ 2 levels "Aqua","Terra": 2 2 2 2 2 2 2 2 2 2 ...
$ instrument: Factor w/ 1 level "MODIS": 1 1 1 1 1 1 1 1 1 1 ...
$ confidence: int 75 56 55 90 74 62 46 100 76 88 ...
$ version : Factor w/ 1 level "6.0NRT": 1 1 1 1 1 1 1 1 1 1 ...
$ bright_t31: num 292 288 288 288 289 ...
$ frp : num 8.9 7.1 6.8 17.5 10.2 14.9 11.9 62.5 31.6 14 ...
$ daynight : Factor w/ 2 levels "D","N": 2 2 2 2 2 2 2 2 2 2 ...
In order to accomplish an accurate analysis in this dataset, we need to filter some rows. By doing so, we can get close to the data quality of INPE’s dataset.
First of all, the type describes the Inferred hot spot type and assumes 4 values: 0 (presumed vegetation fire), 1 (active volcano), 2 (other statis land source), 3 (offshore). Therefore, we’re going the filter the data for type = 0.
Another important thing to note is the confidence variable. This value is based on a collection of intermediate algorithm quantities used in the detection process. It is intended to help users gauge the quality of individual hotspot/fire pixels. Confidence estimates range between 0 and 100% and are assigned one of the three fire classes (low-confidence fire, nominal-confidence fire, or high-confidence fire).
As a consequence, I am going to consider only the records with more than 50% of confidence.
Additionally, notice that the non-standard science quality dataset has no information about the type of the fire. Let’s assume all fires from these bunch are vegetation fires. With this, to bind the two datasets, we need to filter the nasaModis1 to get only type 0 fires and then drop this column since the nasaModis2 does not have this variable.
nasaModis1 <- nasaModis1 %>% filter(type == 0)
nasaModis1$type <- NULL
nasaModis2$version <- as.numeric(nasaModis2$version)
nasaModis <- bind_rows(nasaModis1, nasaModis2)
Unequal factor levels: coercing to characterbinding character and factor vector, coercing into character vectorbinding character and factor vector, coercing into character vector
modis <- nasaModis %>%
filter(confidence > 50)
the acq_date variable is of type factor, it needs to be converted to a standard date type;
modis$acq_date <- as.Date(modis$acq_date)
The filtered data corresponds to 78.3% of the hole dataset. By the summary statistics of the data, we clearly don’t need the columns satellite, instrument, version and type. In addition, the scan and track columns do not provide us with useful information. Hence, they will also be forget.
modis[,c("scan", "track", "satellite", "instrument", "version", "type")] <- NULL
Let’s a plot like the acummulated fires vs day of the year, and also accumulated frp vs day of the year
accum_fires_nasa <- modis %>%
mutate(acq_year = year(acq_date)) %>%
select(acq_year, acq_date) %>%
group_by(acq_year, acq_date) %>%
summarise(occurrences = n()) %>%
mutate(cum_sum = cumsum(occurrences),
day_year = yday(acq_date))
accum_fires_nasa %>% ggplot(aes(x = acq_date, y = cum_sum)) +
geom_line() +
scale_x_date(date_breaks = "4 months", date_labels = "%b/%y")
Wrangling data to plot accumulated fires vs day of the year
splitted <- accum_fires_nasa %>% select(acq_year, day_year, cum_sum)
# Using `do` function to replace NAs values with the last records value
tidy <- splitted %>% spread(key = acq_year, value = cum_sum) %>% do(zoo::na.locf(.))
tidy$`2019`[273:nrow(tidy)] <- NA
tidy <- tidy %>%
gather("2015", "2016", "2017", "2018", "2019", key = "acq_year", "value" = fires)
tidy %>%
ggplot(aes(x = day_year, y = fires / 1000, col = acq_year)) +
geom_line() +
scale_color_manual(values = c("grey", "grey", "grey", "grey", "red")) +
scale_x_continuous("Day of the year", breaks = seq(from = 0, to = 366, by = 50)) +
scale_y_continuous("Number of fires in thousands")
Now let’s wrap over the frp variable
accum_frp_nasa <- modis %>%
mutate(acq_year = year(acq_date)) %>%
select(acq_year, acq_date, frp) %>%
group_by(acq_year, acq_date) %>%
summarise(frp = sum(frp)) %>%
mutate(cum_frp = cumsum(frp),
day_year = yday(acq_date))
accum_frp_nasa %>% ggplot(aes(x = acq_date, y = cum_frp/1000000)) +
geom_line() +
scale_x_date(date_breaks = "4 months", date_labels = "%b/%y")
splitted <- accum_frp_nasa %>% select(acq_year, day_year, cum_frp)
# Using `do` function to replace NAs values with the last records value
tidy <- splitted %>% spread(key = acq_year, value = cum_frp) %>% do(zoo::na.locf(.))
tidy$`2019`[273:nrow(tidy)] <- NA
tidy <- tidy %>%
gather("2015", "2016", "2017", "2018", "2019", key = "acq_year", "value" = frp)
tidy %>%
ggplot(aes(x = day_year, y = frp / 1000000, col = acq_year)) +
geom_line() +
scale_color_manual(values = c("grey", "grey", "grey", "grey", "red")) +
scale_x_continuous("Day of the year", breaks = seq(from = 0, to = 366, by = 50)) +
scale_y_continuous("Fire Radiative power in millions")
The result for MODIS corroborate with those from INPE.
One thing that we do not see until this point was the average fire radiative power along the days of the year. However, as we have many extreme cases, i.e., outliers, we are gonna remove them before plotting the statistics
q1 <- quantile(modis$frp, .25)
q3 <- quantile(modis$frp, .75)
iqr <- IQR(modis$frp)
mild_low <- q1 - 1.5 * iqr
mild_high <- q3 + 1.5 * iqr
clean_modis <- modis[modis$frp > mild_low & modis$frp < mild_high, ] #rows
mean_frp_nasa <- clean_modis %>%
mutate(acq_year = year(acq_date),
acq_week = week(acq_date)) %>%
select(acq_year, acq_week, frp) %>%
group_by(acq_year, acq_week) %>%
summarise(mean_frp = mean(frp))
splitted <- mean_frp_nasa %>% select(acq_year, acq_week, mean_frp)
# Using `do` function to replace NAs values with the last records value
tidy <- splitted %>% spread(key = acq_year, value = mean_frp) %>% do(zoo::na.locf(.))
tidy$`2019`[40:nrow(tidy)] <- NA
tidy <- tidy %>%
gather("2015", "2016", "2017", "2018", "2019", key = "acq_year", "value" = mean_frp)
tidy %>%
ggplot(aes(x = acq_week, y = mean_frp, col = acq_year)) +
geom_line() +
scale_color_manual(values = c("grey", "grey", "grey", "grey", "red")) +
scale_x_continuous("Week of the year", breaks = seq(from = 0, to = 52, by = 4)) +
scale_y_continuous("Fire Radiative power")
clean_modis %>%
group_by(acq_date) %>%
summarise(mean_frp = mean(frp)) %>%
ggplot(aes(x = acq_date, y = mean_frp)) +
geom_line() +
scale_x_date(date_breaks = "4 months", date_labels = "%b/%y")
this graph shows that there 9s a strong seasonality in the mean of the fire radiative power which has strongs components of periods Ts = 360 and Ts = 180 (Ts = sampling period).
Let’s investigate the fires of the Amazonia during the fire season:
Cumulative active fire detections from May 1st through October 1st from MODIS (Aqua + Terra) and VIIRS (SNPP) show that fire detections in 2019 have fallen below cumulative levels of fire activity detected in 2012 (the start of the VIIRS record) and 2017 across the Legal Amazon. Through the end of September, fires in 2019 were more intense than any year since 2012, measured in terms of fire radiative power, consistent with the observed increase in deforestation this year.
shape <- read_sf(dsn = "./geo_data/legal_amazon_simple", layer = "UF_AMLeg_LLwgs84_Simple")
# Shapefile of amazon legal area can also be obtained at http://www.dpi.inpe.br/Ambdata/unidades_administrativas.php
shapeINPE <- read_sf(dsn = "./geo_data/AmLeg_LLwgs84", layer = "amazlegal")
legal_amazon <- as.data.frame(shape$geometry[[1]][[1]])
names(legal_amazon) <- c("longitude", "latitude")
legal_amazon_INPE <- as.data.frame(shapeINPE$geometry[[1]][[1]][[1]])
names(legal_amazon_INPE) <- c("longitude", "latitude")
legal_amazon %>% ggplot(aes(x = longitude, y = latitude)) +
geom_polygon()
legal_amazon_INPE %>% ggplot(aes(x = longitude, y = latitude)) +
geom_polygon()
# To see if a point is inside a polygon
# point.in.polygon(-60, -5, shape$geometry[[1]][[1]][,1], shape$geometry[[1]][[1]][,2])
df %>%
select(estado) %>%
group_by(estado) %>%
summarise(n = n()) %>%
right_join(map_df, by = 'estado') %>%
ggplot(aes(x = longitude, y = latitude, group = codigo)) +
geom_polygon() +
geom_point(inherit.aes = F, data = data19, aes(x = longitude, y = latitude, col = bioma), size = 0.1, shape = 3) +
scale_color_manual(values = cbp1) +
geom_path(color = 'white', size = 0.1) +
geom_path(inherit.aes = F, data = legal_amazon_INPE, mapping = aes(x = longitude, y = latitude), color = "yellow")
modis_fcount <- read.csv("./database/legal_amazon_fc_frp_20191001/Table_MODIS_fire_counts_2003_2019.csv")
tidy_modis_fcount <- modis_fcount %>%
gather(-`DOY`, key = "year", "value" = fires)
tidy_modis_fcount$year <- as.integer(substr(tidy_modis_fcount$year, 2, 5))
tidy_modis_fcount$fires <- as.integer(tidy_modis_fcount$fires)
ggplotly(tidy_modis_fcount %>%
ggplot(aes(x = DOY, y = fires, color = factor(year))) +
geom_line())
modis_avg_frp <- read.csv("./database/legal_amazon_fc_frp_20191001/Table_MODIS_fire_radiative_power_2003_2019.csv")
tidy_modis_avg_frp <- modis_avg_frp %>%
gather(-`DOY`, key = "year", "value" = fires)
tidy_modis_avg_frp$year <- as.integer(substr(tidy_modis_avg_frp$year, 2, 5))
tidy_modis_avg_frp$fires <- as.integer(tidy_modis_avg_frp$fires)
ggplotly(
tidy_modis_avg_frp %>%
ggplot(aes(x = DOY, y = fires, color = factor(year))) +
geom_line()
)
From NasaPower API we can obtain useful data for about 146 weather and climate parameters, such as:
Relative humidty (at two meters) Temperature (at two meters) Precipitation (mm) Maximum/Minimum Monthly Difference From Monthly Averaged All Sky Insolation Wind Speed
geographical data are provided at the resolution of 1/2 arc degree longitude by 1/2 arc degree latitude. for more detailed info go to https://power.larc.nasa.gov/documents/POWER_Data_v9_methodology.pdf
Another useful resource can be found at https://earthobservatory.nasa.gov/images/145464/fires-in-brazil
Also in https://www.globalfiredata.org/data.html we can find data such as carbon emissions and dry matter emissions.
For highly personalizable images go to https://worldview.earthdata.nasa.gov/
Detailed information about fires on the Legal Amazon area is given at http://www.globalfiredata.org/forecast.html#amazonas
Thoughts on fire forecasting https://www.nasa.gov/feature/goddard/2019/fire-forecasting-from-smart-phone-in-wilderness
Wildfires against biodiversity https://onlinelibrary.wiley.com/doi/abs/10.1111/btp.12267 and https://link.springer.com/article/10.1007/s10531-012-0426-8